Choose the Complexity Parameter (cp) for the Tree and Prune the Tree
# Fit a tree with the `tree` package and cross-validate it so the deviance can
# be inspected against tree size and against the cost-complexity penalty k.
# NOTE(review): tree() grows a regression tree when the response is numeric;
# confirm Outcome is a factor if a classification tree is intended here.
diabetes.tree2 <- tree(Outcome ~ ., data = diabetes.df)
cv.diabetes <- cv.tree(diabetes.tree2)

# Draw both diagnostics side by side. par() returns the previous settings, so
# they can be restored afterwards; otherwise every later base-graphics plot
# would be squeezed into one half of a 1x2 layout.
old.par <- par(mfrow = c(1, 2))
plot(cv.diabetes$size, cv.diabetes$dev, type = "b",
     xlab = "Tree size (terminal nodes)", ylab = "Cross-validated deviance")
plot(cv.diabetes$k, cv.diabetes$dev, type = "b",
     xlab = "Cost-complexity parameter k", ylab = "Cross-validated deviance")
par(old.par)

# Print the detailed summary of the rpart fit diabetes.tree: the CP table
# (cp, number of splits, relative error, cross-validated error and its
# standard error), the variable importance scores, and for every node the
# class counts plus the primary and surrogate splits that were considered.
summary(diabetes.tree)
## Call:
## rpart(formula = Outcome ~ ., data = diabetes.df, method = "class")
## n= 768
##
## CP nsplit rel error xerror xstd
## 1 0.24253731 0 1.0000000 1.0000000 0.04928752
## 2 0.10447761 1 0.7574627 0.8059701 0.04649230
## 3 0.01741294 2 0.6529851 0.7537313 0.04552694
## 4 0.01492537 5 0.6007463 0.7574627 0.04559916
## 5 0.01305970 9 0.5410448 0.7500000 0.04545421
## 6 0.01119403 12 0.4925373 0.7537313 0.04552694
## 7 0.01000000 15 0.4589552 0.7500000 0.04545421
##
## Variable importance
## Glucose BMI Age
## 39 17 12
## BloodPressure Insulin Pregnancies
## 10 8 6
## DiabetesPedigreeFunction SkinThickness
## 5 2
##
## Node number 1: 768 observations, complexity param=0.2425373
## predicted class=0 expected loss=0.3489583 P(node) =1
## class counts: 500 268
## probabilities: 0.651 0.349
## left son=2 (485 obs) right son=3 (283 obs)
## Primary splits:
## Glucose < 127.5 to the left, improve=63.36011, (0 missing)
## Age < 28.5 to the left, improve=33.99082, (0 missing)
## BMI < 29.85 to the left, improve=32.92453, (0 missing)
## Pregnancies < 6.5 to the left, improve=19.69295, (0 missing)
## Insulin < 121 to the left, improve=13.33926, (0 missing)
## Surrogate splits:
## Insulin < 121 to the left, agree=0.697, adj=0.177, (0 split)
## Age < 48.5 to the left, agree=0.665, adj=0.092, (0 split)
## BloodPressure < 81 to the left, agree=0.659, adj=0.074, (0 split)
## BMI < 39.75 to the left, agree=0.659, adj=0.074, (0 split)
## DiabetesPedigreeFunction < 1.149 to the left, agree=0.642, adj=0.028, (0 split)
##
## Node number 2: 485 observations, complexity param=0.01492537
## predicted class=0 expected loss=0.1938144 P(node) =0.6315104
## class counts: 391 94
## probabilities: 0.806 0.194
## left son=4 (271 obs) right son=5 (214 obs)
## Primary splits:
## Age < 28.5 to the left, improve=14.579100, (0 missing)
## BMI < 26.95 to the left, improve=10.747980, (0 missing)
## Glucose < 99.5 to the left, improve= 8.411871, (0 missing)
## Pregnancies < 6.5 to the left, improve= 8.228052, (0 missing)
## DiabetesPedigreeFunction < 0.659 to the left, improve= 5.950521, (0 missing)
## Surrogate splits:
## Pregnancies < 3.5 to the left, agree=0.802, adj=0.551, (0 split)
## BloodPressure < 71 to the left, agree=0.658, adj=0.224, (0 split)
## SkinThickness < 7.5 to the right, agree=0.627, adj=0.154, (0 split)
## Insulin < 7.5 to the right, agree=0.625, adj=0.150, (0 split)
## Glucose < 113.5 to the left, agree=0.598, adj=0.089, (0 split)
##
## Node number 3: 283 observations, complexity param=0.1044776
## predicted class=1 expected loss=0.385159 P(node) =0.3684896
## class counts: 109 174
## probabilities: 0.385 0.615
## left son=6 (76 obs) right son=7 (207 obs)
## Primary splits:
## BMI < 29.95 to the left, improve=18.584530, (0 missing)
## Glucose < 154.5 to the left, improve=15.229510, (0 missing)
## Age < 24.5 to the left, improve= 7.413805, (0 missing)
## DiabetesPedigreeFunction < 0.3165 to the left, improve= 5.911141, (0 missing)
## Pregnancies < 7.5 to the left, improve= 4.076871, (0 missing)
## Surrogate splits:
## Age < 21.5 to the left, agree=0.742, adj=0.039, (0 split)
## DiabetesPedigreeFunction < 0.1255 to the left, agree=0.735, adj=0.013, (0 split)
##
## Node number 4: 271 observations
## predicted class=0 expected loss=0.08487085 P(node) =0.3528646
## class counts: 248 23
## probabilities: 0.915 0.085
##
## Node number 5: 214 observations, complexity param=0.01492537
## predicted class=0 expected loss=0.3317757 P(node) =0.2786458
## class counts: 143 71
## probabilities: 0.668 0.332
## left son=10 (41 obs) right son=11 (173 obs)
## Primary splits:
## BMI < 26.35 to the left, improve=8.123435, (0 missing)
## Glucose < 99.5 to the left, improve=7.110539, (0 missing)
## Insulin < 142.5 to the left, improve=6.235009, (0 missing)
## DiabetesPedigreeFunction < 0.625 to the left, improve=3.627461, (0 missing)
## Age < 56.5 to the right, improve=2.267415, (0 missing)
## Surrogate splits:
## Age < 66.5 to the right, agree=0.813, adj=0.024, (0 split)
##
## Node number 6: 76 observations, complexity param=0.01119403
## predicted class=0 expected loss=0.3157895 P(node) =0.09895833
## class counts: 52 24
## probabilities: 0.684 0.316
## left son=12 (41 obs) right son=13 (35 obs)
## Primary splits:
## Glucose < 145.5 to the left, improve=5.112489, (0 missing)
## Age < 26.5 to the left, improve=2.823058, (0 missing)
## BMI < 23.2 to the left, improve=2.296651, (0 missing)
## Pregnancies < 1.5 to the left, improve=2.245614, (0 missing)
## Insulin < 177.5 to the right, improve=1.300847, (0 missing)
## Surrogate splits:
## BMI < 28.85 to the left, agree=0.632, adj=0.200, (0 split)
## Age < 23.5 to the left, agree=0.632, adj=0.200, (0 split)
## Pregnancies < 3.5 to the left, agree=0.592, adj=0.114, (0 split)
## Insulin < 44.5 to the right, agree=0.592, adj=0.114, (0 split)
## DiabetesPedigreeFunction < 0.2085 to the right, agree=0.592, adj=0.114, (0 split)
##
## Node number 7: 207 observations, complexity param=0.01741294
## predicted class=1 expected loss=0.2753623 P(node) =0.2695312
## class counts: 57 150
## probabilities: 0.275 0.725
## left son=14 (115 obs) right son=15 (92 obs)
## Primary splits:
## Glucose < 157.5 to the left, improve=6.956522, (0 missing)
## DiabetesPedigreeFunction < 0.309 to the left, improve=3.715521, (0 missing)
## BloodPressure < 61 to the right, improve=2.965253, (0 missing)
## Age < 24.5 to the left, improve=2.783779, (0 missing)
## Pregnancies < 7.5 to the left, improve=2.159170, (0 missing)
## Surrogate splits:
## Insulin < 183.5 to the left, agree=0.594, adj=0.087, (0 split)
## Age < 46.5 to the left, agree=0.589, adj=0.076, (0 split)
## DiabetesPedigreeFunction < 0.744 to the left, agree=0.585, adj=0.065, (0 split)
## BMI < 30.85 to the right, agree=0.580, adj=0.054, (0 split)
## BloodPressure < 103 to the left, agree=0.570, adj=0.033, (0 split)
##
## Node number 10: 41 observations
## predicted class=0 expected loss=0.04878049 P(node) =0.05338542
## class counts: 39 2
## probabilities: 0.951 0.049
##
## Node number 11: 173 observations, complexity param=0.01492537
## predicted class=0 expected loss=0.3988439 P(node) =0.2252604
## class counts: 104 69
## probabilities: 0.601 0.399
## left son=22 (55 obs) right son=23 (118 obs)
## Primary splits:
## Glucose < 99.5 to the left, improve=7.595901, (0 missing)
## DiabetesPedigreeFunction < 0.625 to the left, improve=6.657917, (0 missing)
## Insulin < 142.5 to the left, improve=5.270370, (0 missing)
## BloodPressure < 91 to the right, improve=1.895734, (0 missing)
## Age < 56.5 to the right, improve=1.572004, (0 missing)
## Surrogate splits:
## DiabetesPedigreeFunction < 0.098 to the left, agree=0.688, adj=0.018, (0 split)
##
## Node number 12: 41 observations
## predicted class=0 expected loss=0.1463415 P(node) =0.05338542
## class counts: 35 6
## probabilities: 0.854 0.146
##
## Node number 13: 35 observations, complexity param=0.01119403
## predicted class=1 expected loss=0.4857143 P(node) =0.04557292
## class counts: 17 18
## probabilities: 0.486 0.514
## left son=26 (21 obs) right son=27 (14 obs)
## Primary splits:
## Insulin < 14.5 to the left, improve=1.8666670, (0 missing)
## Age < 28.5 to the left, improve=1.4486770, (0 missing)
## BloodPressure < 74.5 to the right, improve=1.1958590, (0 missing)
## BMI < 25.55 to the right, improve=0.9657143, (0 missing)
## Pregnancies < 1.5 to the left, improve=0.9142857, (0 missing)
## Surrogate splits:
## SkinThickness < 7 to the left, agree=0.943, adj=0.857, (0 split)
## DiabetesPedigreeFunction < 0.315 to the left, agree=0.686, adj=0.214, (0 split)
## Age < 27.5 to the right, agree=0.657, adj=0.143, (0 split)
## Pregnancies < 1.5 to the right, agree=0.629, adj=0.071, (0 split)
## BloodPressure < 74.5 to the right, agree=0.629, adj=0.071, (0 split)
##
## Node number 14: 115 observations, complexity param=0.01741294
## predicted class=1 expected loss=0.3913043 P(node) =0.1497396
## class counts: 45 70
## probabilities: 0.391 0.609
## left son=28 (50 obs) right son=29 (65 obs)
## Primary splits:
## Age < 30.5 to the left, improve=3.911839, (0 missing)
## BloodPressure < 61 to the right, improve=3.635942, (0 missing)
## DiabetesPedigreeFunction < 0.421 to the left, improve=3.496289, (0 missing)
## BMI < 41.65 to the left, improve=2.717391, (0 missing)
## Pregnancies < 7.5 to the left, improve=2.660742, (0 missing)
## Surrogate splits:
## Pregnancies < 4.5 to the left, agree=0.800, adj=0.54, (0 split)
## BloodPressure < 71 to the left, agree=0.670, adj=0.24, (0 split)
## Insulin < 186 to the right, agree=0.617, adj=0.12, (0 split)
## BMI < 31.25 to the left, agree=0.591, adj=0.06, (0 split)
## DiabetesPedigreeFunction < 0.436 to the left, agree=0.591, adj=0.06, (0 split)
##
## Node number 15: 92 observations
## predicted class=1 expected loss=0.1304348 P(node) =0.1197917
## class counts: 12 80
## probabilities: 0.130 0.870
##
## Node number 22: 55 observations
## predicted class=0 expected loss=0.1818182 P(node) =0.07161458
## class counts: 45 10
## probabilities: 0.818 0.182
##
## Node number 23: 118 observations, complexity param=0.01492537
## predicted class=0 expected loss=0.5 P(node) =0.1536458
## class counts: 59 59
## probabilities: 0.500 0.500
## left son=46 (84 obs) right son=47 (34 obs)
## Primary splits:
## DiabetesPedigreeFunction < 0.561 to the left, improve=5.288515, (0 missing)
## BloodPressure < 85 to the right, improve=2.615248, (0 missing)
## Insulin < 142.5 to the left, improve=2.187185, (0 missing)
## Age < 57 to the right, improve=1.898327, (0 missing)
## BMI < 34.65 to the right, improve=1.812039, (0 missing)
## Surrogate splits:
## Insulin < 190.5 to the left, agree=0.746, adj=0.118, (0 split)
## BMI < 43.35 to the left, agree=0.729, adj=0.059, (0 split)
## SkinThickness < 44.5 to the left, agree=0.720, adj=0.029, (0 split)
##
## Node number 26: 21 observations
## predicted class=0 expected loss=0.3809524 P(node) =0.02734375
## class counts: 13 8
## probabilities: 0.619 0.381
##
## Node number 27: 14 observations
## predicted class=1 expected loss=0.2857143 P(node) =0.01822917
## class counts: 4 10
## probabilities: 0.286 0.714
##
## Node number 28: 50 observations, complexity param=0.01741294
## predicted class=0 expected loss=0.46 P(node) =0.06510417
## class counts: 27 23
## probabilities: 0.540 0.460
## left son=56 (40 obs) right son=57 (10 obs)
## Primary splits:
## BloodPressure < 61 to the right, improve=7.290000, (0 missing)
## Insulin < 199 to the right, improve=2.347937, (0 missing)
## BMI < 41.8 to the left, improve=2.014825, (0 missing)
## SkinThickness < 19.5 to the right, improve=1.300317, (0 missing)
## Pregnancies < 0.5 to the right, improve=1.284444, (0 missing)
##
## Node number 29: 65 observations
## predicted class=1 expected loss=0.2769231 P(node) =0.08463542
## class counts: 18 47
## probabilities: 0.277 0.723
##
## Node number 46: 84 observations, complexity param=0.0130597
## predicted class=0 expected loss=0.4047619 P(node) =0.109375
## class counts: 50 34
## probabilities: 0.595 0.405
## left son=92 (21 obs) right son=93 (63 obs)
## Primary splits:
## DiabetesPedigreeFunction < 0.2 to the left, improve=2.571429, (0 missing)
## Age < 54.5 to the right, improve=2.502165, (0 missing)
## BloodPressure < 85 to the right, improve=2.221581, (0 missing)
## BMI < 34.65 to the right, improve=2.114657, (0 missing)
## SkinThickness < 28 to the right, improve=1.895085, (0 missing)
## Surrogate splits:
## BloodPressure < 51 to the left, agree=0.786, adj=0.143, (0 split)
## SkinThickness < 47.5 to the right, agree=0.762, adj=0.048, (0 split)
##
## Node number 47: 34 observations
## predicted class=1 expected loss=0.2647059 P(node) =0.04427083
## class counts: 9 25
## probabilities: 0.265 0.735
##
## Node number 56: 40 observations, complexity param=0.01119403
## predicted class=0 expected loss=0.325 P(node) =0.05208333
## class counts: 27 13
## probabilities: 0.675 0.325
## left son=112 (31 obs) right son=113 (9 obs)
## Primary splits:
## BMI < 41.8 to the left, improve=2.711290, (0 missing)
## Insulin < 260 to the right, improve=2.453226, (0 missing)
## DiabetesPedigreeFunction < 0.311 to the left, improve=2.002381, (0 missing)
## BloodPressure < 73 to the right, improve=1.633333, (0 missing)
## Pregnancies < 0.5 to the right, improve=1.319231, (0 missing)
## Surrogate splits:
## BloodPressure < 84.5 to the left, agree=0.825, adj=0.222, (0 split)
## SkinThickness < 40.5 to the left, agree=0.800, adj=0.111, (0 split)
##
## Node number 57: 10 observations
## predicted class=1 expected loss=0 P(node) =0.01302083
## class counts: 0 10
## probabilities: 0.000 1.000
##
## Node number 92: 21 observations
## predicted class=0 expected loss=0.1904762 P(node) =0.02734375
## class counts: 17 4
## probabilities: 0.810 0.190
##
## Node number 93: 63 observations, complexity param=0.0130597
## predicted class=0 expected loss=0.4761905 P(node) =0.08203125
## class counts: 33 30
## probabilities: 0.524 0.476
## left son=186 (52 obs) right son=187 (11 obs)
## Primary splits:
## Pregnancies < 1.5 to the right, improve=3.117383, (0 missing)
## BloodPressure < 67 to the right, improve=2.603571, (0 missing)
## Age < 48 to the right, improve=2.309690, (0 missing)
## SkinThickness < 26.5 to the right, improve=1.928571, (0 missing)
## DiabetesPedigreeFunction < 0.4255 to the right, improve=1.728571, (0 missing)
## Surrogate splits:
## SkinThickness < 45.5 to the left, agree=0.857, adj=0.182, (0 split)
## Insulin < 193 to the left, agree=0.857, adj=0.182, (0 split)
## BMI < 44.55 to the left, agree=0.857, adj=0.182, (0 split)
##
## Node number 112: 31 observations
## predicted class=0 expected loss=0.2258065 P(node) =0.04036458
## class counts: 24 7
## probabilities: 0.774 0.226
##
## Node number 113: 9 observations
## predicted class=1 expected loss=0.3333333 P(node) =0.01171875
## class counts: 3 6
## probabilities: 0.333 0.667
##
## Node number 186: 52 observations, complexity param=0.0130597
## predicted class=0 expected loss=0.4038462 P(node) =0.06770833
## class counts: 31 21
## probabilities: 0.596 0.404
## left son=372 (40 obs) right son=373 (12 obs)
## Primary splits:
## BloodPressure < 67 to the right, improve=3.738462, (0 missing)
## Insulin < 11 to the right, improve=2.611571, (0 missing)
## SkinThickness < 26.5 to the right, improve=2.377855, (0 missing)
## BMI < 34.05 to the right, improve=2.377855, (0 missing)
## Age < 47.5 to the right, improve=2.286081, (0 missing)
##
## Node number 187: 11 observations
## predicted class=1 expected loss=0.1818182 P(node) =0.01432292
## class counts: 2 9
## probabilities: 0.182 0.818
##
## Node number 372: 40 observations
## predicted class=0 expected loss=0.3 P(node) =0.05208333
## class counts: 28 12
## probabilities: 0.700 0.300
##
## Node number 373: 12 observations
## predicted class=1 expected loss=0.25 P(node) =0.015625
## class counts: 3 9
## probabilities: 0.250 0.750
# Plot the cross-validation results of the unpruned rpart fit against cp, so
# the cut-off used below can be read off the curve.
plotcp(diabetes.tree)

# Prune the rpart fit at cp = 0.043 and keep the smaller tree separately; the
# original diabetes.tree object is left unchanged.
diabetes.prune <- prune(diabetes.tree, cp = 0.043)

Generate the Correlation Matrix Using corrplot & plotly
# Convert the data frame to a matrix and hand it to rcorr(); the result is
# stored in corr (its $r component is reused for corrplot further down) and
# then printed.
corr <- rcorr(x = as.matrix(diabetes.df))
print(corr)
## Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies 1.00 0.13 0.14 -0.08
## Glucose 0.13 1.00 0.15 0.06
## BloodPressure 0.14 0.15 1.00 0.21
## SkinThickness -0.08 0.06 0.21 1.00
## Insulin -0.07 0.33 0.09 0.44
## BMI 0.02 0.22 0.28 0.39
## DiabetesPedigreeFunction -0.03 0.14 0.04 0.18
## Age 0.54 0.26 0.24 -0.11
## Outcome 0.22 0.47 0.07 0.07
## Insulin BMI DiabetesPedigreeFunction Age
## Pregnancies -0.07 0.02 -0.03 0.54
## Glucose 0.33 0.22 0.14 0.26
## BloodPressure 0.09 0.28 0.04 0.24
## SkinThickness 0.44 0.39 0.18 -0.11
## Insulin 1.00 0.20 0.19 -0.04
## BMI 0.20 1.00 0.14 0.04
## DiabetesPedigreeFunction 0.19 0.14 1.00 0.03
## Age -0.04 0.04 0.03 1.00
## Outcome 0.13 0.29 0.17 0.24
## Outcome
## Pregnancies 0.22
## Glucose 0.47
## BloodPressure 0.07
## SkinThickness 0.07
## Insulin 0.13
## BMI 0.29
## DiabetesPedigreeFunction 0.17
## Age 0.24
## Outcome 1.00
##
## n= 768
##
##
## P
## Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies 0.0003 0.0000 0.0236
## Glucose 0.0003 0.0000 0.1124
## BloodPressure 0.0000 0.0000 0.0000
## SkinThickness 0.0236 0.1124 0.0000
## Insulin 0.0416 0.0000 0.0137 0.0000
## BMI 0.6246 0.0000 0.0000 0.0000
## DiabetesPedigreeFunction 0.3535 0.0001 0.2534 0.0000
## Age 0.0000 0.0000 0.0000 0.0016
## Outcome 0.0000 0.0000 0.0715 0.0383
## Insulin BMI DiabetesPedigreeFunction Age
## Pregnancies 0.0416 0.6246 0.3535 0.0000
## Glucose 0.0000 0.0000 0.0001 0.0000
## BloodPressure 0.0137 0.0000 0.2534 0.0000
## SkinThickness 0.0000 0.0000 0.0000 0.0016
## Insulin 0.0000 0.0000 0.2432
## BMI 0.0000 0.0000 0.3158
## DiabetesPedigreeFunction 0.0000 0.0000 0.3530
## Age 0.2432 0.3158 0.3530
## Outcome 0.0003 0.0000 0.0000 0.0000
## Outcome
## Pregnancies 0.0000
## Glucose 0.0000
## BloodPressure 0.0715
## SkinThickness 0.0383
## Insulin 0.0003
## BMI 0.0000
## DiabetesPedigreeFunction 0.0000
## Age 0.0000
## Outcome
# Font used for the plotly figure; white text so the title and tick labels
# stay readable on the black paper background set in layout() below.
# NOTE(review): `t` shares its name with base::t(); the name is kept in case
# later chunks refer to it.
t <- list(
  family = "Arial",
  size = 13,
  color = "white"
)

# Static correlation plot: upper triangle only, variables ordered by
# hierarchical clustering, black labels rotated by 45 degrees.
corrplot(corr$r, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)

# Interactive heatmap of the same correlations.
# paper_bgcolor is a layout attribute, not a heatmap trace attribute: passing
# it to plot_ly() only produced the warning "'heatmap' objects don't have
# these attributes: 'paper_bgcolor'". It is therefore set in layout() only.
p <- plot_ly(
  z = cor(data.matrix(diabetes.df)),
  x = colnames(diabetes.df),
  y = colnames(diabetes.df),
  type = "heatmap",
  colorscale = "Electric"
) %>%
  layout(paper_bgcolor = "black", title = "Correlation Matrix", font = t)
p